require(Hmisc)
knitrSet(lang='markdown')

Descriptive Statistics

require(rms)
options(prType='html')
getHdata(FEV)
html(contents(FEV))

Data frame:FEV

654 observations and 6 variables, maximum # NAs:0  
NameUnitsLevelsStorage
idinteger
ageyearsinteger
fevlitersdouble
heightinchesdouble
sex2integer
smoke2integer

Category Levels
sex
  • female
  • male
  • smoke
  • non-current smoker
  • current smoker
  • options(grType='plotly')  # use plotly interactive graphics
    plot(describe(FEV))

    $Categorical

    $Continuous

    require(plotly, quietly=TRUE)
    ggplotly(ggplot(FEV, aes(x=age, y=fev, color=height)) + geom_point() + facet_grid(smoke ~ sex))
    dd <- datadist(FEV); options(datadist='dd')
    htmlVerbatim(dd)    # in Hmisc package
                             id age      fev height    sex              smoke
     Low:effect      15811.0000   8 1.981000   57.0                  
     Adjust to       36071.0000  10 2.547500   61.5   male non-current smoker
     High:effect     53638.5000  12 3.118500   65.5                  
     Low:prediction    600.2355   4 1.252128   49.0 female non-current smoker
     High:prediction 81741.1529  17 4.789642   72.0   male     current smoker
     Low               201.0000   3 0.791000   46.0 female non-current smoker
     High            90001.0000  19 5.793000   74.0   male     current smoker
     
     Values:
     
     sex : female male 
     smoke : non-current smoker current smoker 
     

    Regression Models of Increasing Complexity

    FEV is log-transformed to satisfy normality and equal variance assumptions.

    Single Predictor, Linear Effect

    f <- ols(log(fev) ~ age, data=FEV)
    f
    Linear Regression Model
     ols(formula = log(fev) ~ age, data = FEV)
     
    Model Likelihood
    Ratio Test
    Discrimination
    Indexes
    Obs 654 LR χ2 592.40 R2 0.596
    σ 0.2120 d.f. 1 R2adj 0.595
    d.f. 652 Pr(>χ2) 0.0000 g 0.287
    Residuals
          Min       1Q   Median       3Q      Max 
     -0.72047 -0.13752  0.00302  0.14681  0.60267 
     
    β S.E. t Pr(>|t|)
    Intercept  0.0506  0.0291 1.74 0.0826
    age  0.0871  0.0028 31.00 <0.0001
    r <- as.numeric(resid(f))
    with(FEV, plot(age, r))
    qqnorm(r)
    qqline(r)

    # Show algebraic form of fitted model
    g <- Function(f)
    htmlVerbatim(g, exp(g()), exp(g(age=10)))   # Evaluate the fitted model, antilog to get original scale
     function (age = 10) 
     {
         0.050596002 + 0.087083296 * age
     }
     
     [1] 2.512879
     [1] 2.512879
     
    latex(f, md=TRUE)
    \[{\rm E({\rm log(fev)}}) = X\beta, {\rm \ \ where} \\ \] \begin{eqnarray*} X\hat{\beta}= & & \\ & & 0.050596 +0.0870833\:{\rm age} \\ \end{eqnarray*}
    ggplot(Predict(f), addlayer=geom_point(aes(x=age, y=log(fev), color=sex), FEV))

    Restricted Cubic Spline With 3 Default Knots

    The knots are at these quantiles of age: 0.1, 0.5, 0.9.

    f <- ols(log(fev) ~ rcs(age, 3), data=FEV, x=TRUE)
    print(f)
    Linear Regression Model
     ols(formula = log(fev) ~ rcs(age, 3), data = FEV, x = TRUE)
     
    Model Likelihood
    Ratio Test
    Discrimination
    Indexes
    Obs 654 LR χ2 668.33 R2 0.640
    σ 0.2002 d.f. 2 R2adj 0.639
    d.f. 651 Pr(>χ2) 0.0000 g 0.297
    Residuals
           Min        1Q    Median        3Q       Max 
     -0.608234 -0.134592  0.006764  0.139165  0.538798 
     
    β S.E. t Pr(>|t|)
    Intercept  -0.3377  0.0513 -6.58 <0.0001
    age   0.1386  0.0063 21.88 <0.0001
    age'  -0.0627  0.0070 -8.95 <0.0001
    g <- Function(f)
    htmlVerbatim(g(10:20), g(), g)
      [1] 0.9852999 1.0660775 1.1292243 1.1806172 1.2261332 1.2706697 1.3152062 1.3597427
      [9] 1.4042792 1.4488157 1.4933522
     [1] 0.9852999
     function (age = 10) 
     {
         -0.33768719 + 0.13856744 * age - 0.00097948891 * pmax(age - 
             6, 0)^3 + 0.0019589778 * pmax(age - 10, 0)^3 - 0.00097948891 * 
             pmax(age - 14, 0)^3
     }
     
     
    ggplot(Predict(f))
    ha <- function(f) print(anova(f), dec.ms=2, dec.ss=2)
    ha(f)
    Analysis of Variance for log(fev)
    d.f. Partial SS MS F P
    age 2 46.42 23.21 578.90 <0.0001
    Nonlinear 1 3.21 3.21 80.14 <0.0001
    REGRESSION 2 46.42 23.21 578.90 <0.0001
    ERROR 651 26.10 0.04

    RCS With 5 Default Knots

    f <- ols(log(fev) ~ rcs(age, 5), data=FEV)
    f
    Linear Regression Model
     ols(formula = log(fev) ~ rcs(age, 5), data = FEV)
     
    Model Likelihood
    Ratio Test
    Discrimination
    Indexes
    Obs 654 LR χ2 678.99 R2 0.646
    σ 0.1989 d.f. 4 R2adj 0.644
    d.f. 649 Pr(>χ2) 0.0000 g 0.303
    Residuals
           Min        1Q    Median        3Q       Max 
     -0.621716 -0.135183  0.008372  0.144535  0.543184 
     
    β S.E. t Pr(>|t|)
    Intercept  -0.1758  0.0985 -1.78 0.0748
    age   0.1126  0.0159 7.10 <0.0001
    age’   0.0383  0.0817 0.47 0.6391
    age’’  -0.2167  0.4077 -0.53 0.5951
    age’’’  -0.1496  0.9059 -0.17 0.8689
    Function(f)

    function (age = 10) { -0.17580202 + 0.11261215 * age + 0.0003834753 * pmax(age - 5, 0)^3 - 0.0021674701 * pmax(age - 8, 0)^3 - 0.0014960677 * pmax(age - 10, 0)^3 + 0.0047044691 * pmax(age - 11, 0)^3 - 0.0014244066 * pmax(age - 15, 0)^3 } <environment: 0x55d2767e85f0>

    ha(f)
    Analysis of Variance for log(fev)
    d.f. Partial SS MS F P
    age 4 46.85 11.71 295.97 <0.0001
    Nonlinear 3 3.64 1.21 30.62 <0.0001
    REGRESSION 4 46.85 11.71 295.97 <0.0001
    ERROR 649 25.68 0.04
    ggplot(Predict(f))

    RCS with 5 Knots, Additive (Non-Interacting) Sex Effect

    f <- ols(log(fev) ~ rcs(age, 5) + sex, data=FEV)
    Function(f)

    function (age = 10, sex = “male”) { -0.23344699 + 0.11429274 * age + 0.00019963206 * pmax(age - 5, 0)^3 - 0.0010413455 * pmax(age - 8, 0)^3 - 0.0043083413 * pmax(age - 10, 0)^3 + 0.0067087011 * pmax(age - 11, 0)^3 - 0.0015586464 * pmax(age - 15, 0)^3 + 0.099501651 * (sex == “male”) } <environment: 0x55d27a5566d0>

    ha(f)
    Analysis of Variance for log(fev)
    d.f. Partial SS MS F P
    age 4 46.37 11.59 312.11 <0.0001
    Nonlinear 3 3.68 1.23 32.99 <0.0001
    sex 1 1.61 1.61 43.40 <0.0001
    REGRESSION 5 48.46 9.69 260.92 <0.0001
    ERROR 648 24.07 0.04
    ggplot(Predict(f, age, sex),
           addlayer=geom_point(aes(x=age, y=log(fev), color=sex), FEV))

    RCS in Age Interacting With Sex

    The following model allows for different shapes of effects for males and females.

    f <- ols(log(fev) ~ rcs(age, 5) * sex, data=FEV)
    r <- as.numeric(resid(f))
    plot(fitted(f), r)
    qqnorm(r); qqline(r)
    Function(f)

    function (age = 10, sex = “male”) { -0.33458936 + 0.13366998 * age - 2.9172721e-05 * pmax(age - 5, 0)^3 - 0.0039056026 * pmax(age - 8, 0)^3 + 0.0078871147 * pmax(age - 10, 0)^3 - 0.002951157 * pmax(age - 11, 0)^3 - 0.0010011823 * pmax(age - 15, 0)^3 + 0.34502774 * (sex == “male”) + (sex == “male”) * (-0.046273816 * age + 0.00093632418 * pmax(age - 5, 0)^3 + 0.0034513932 * pmax(age - 8, 0)^3 - 0.020568634 * pmax(age - 10, 0)^3 + 0.017330044 * pmax(age - 11, 0)^3 - 0.0011491273 * pmax(age - 15, 0)^3) } <environment: 0x55d2772d5b38>

    ha(f)

    Analysis of Variance for log(fev)
    d.f. Partial SS MS F P
    age (Factor+Higher Order Factors) 8 48.31 6.04 175.70 <0.0001
    All Interactions 4 1.94 0.48 14.08 <0.0001
    Nonlinear (Factor+Higher Order Factors) 6 4.74 0.79 22.98 <0.0001
    sex (Factor+Higher Order Factors) 5 3.55 0.71 20.65 <0.0001
    All Interactions 4 1.94 0.48 14.08 <0.0001
    age × sex (Factor+Higher Order Factors) 4 1.94 0.48 14.08 <0.0001
    Nonlinear 3 0.82 0.27 7.96 <0.0001
    Nonlinear Interaction : f(A,B) vs. AB 3 0.82 0.27 7.96 <0.0001
    TOTAL NONLINEAR 6 4.74 0.79 22.98 <0.0001
    TOTAL NONLINEAR + INTERACTION 7 5.61 0.80 23.32 <0.0001
    REGRESSION 9 50.39 5.60 162.92 <0.0001
    ERROR 644 22.13 0.03

    ggplot(Predict(f, age, sex))

    Plot predicted median FEV by anti-logging predicted values.

    ggplot(Predict(f, age, sex, fun=exp), ylab='Estimated Median FEV')

    RCS with Age * Sex and Additive Nonlinear Effect of Height

    We show the joint age and height effects using a color image plot

    f <- ols(log(fev) ~ rcs(age, 5) * sex + rcs(height, 5), data=FEV)
    ha(f)
    Analysis of Variance for log(fev)
    d.f. Partial SS MS F P
    age (Factor+Higher Order Factors) 8 1.33 0.17 8.02 <0.0001
    All Interactions 4 0.36 0.09 4.39 0.0016
    Nonlinear (Factor+Higher Order Factors) 6 0.44 0.07 3.51 0.0020
    sex (Factor+Higher Order Factors) 5 0.60 0.12 5.77 <0.0001
    All Interactions 4 0.36 0.09 4.39 0.0016
    height 4 8.88 2.22 107.27 <0.0001
    Nonlinear 3 0.17 0.06 2.77 0.0408
    age × sex (Factor+Higher Order Factors) 4 0.36 0.09 4.39 0.0016
    Nonlinear 3 0.33 0.11 5.37 0.0012
    Nonlinear Interaction : f(A,B) vs. AB 3 0.33 0.11 5.37 0.0012
    TOTAL NONLINEAR 9 0.58 0.06 3.12 0.0011
    TOTAL NONLINEAR + INTERACTION 10 0.59 0.06 2.83 0.0019
    REGRESSION 13 59.28 4.56 220.25 <0.0001
    ERROR 640 13.25 0.02
    ggplot(Predict(f, age, sex), adj.subtitle=FALSE)
    p <- Predict(f, age, height)
    bplot(p)

    p <- with(p, list(age=unique(age), height=unique(height),
                      Yhat=matrix(yhat, ncol=200)))
    with(p, plot_ly(x=age, y=height, z=Yhat, type='heatmap'))
    with(p, plot_ly(x=age, y=height, z=Yhat, type='surface'))

    Model Ignoring Sex and Height But Including Smoking History

    By plotting spike histograms showing the smoking-specific age distribution we see that there are almost no very young children who have smoked. This limits the power to detect an age by smoking interaction.

    f <- ols(log(fev) ~ rcs(age, 5) + smoke, data=FEV)
    ggplot(Predict(f, age, smoke), rdata=FEV)

    Computing Environment

    R version 3.6.1 (2019-07-05)
    Platform: x86_64-pc-linux-gnu (64-bit)
    Running under: Ubuntu 19.04
    
    Matrix products: default
    BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.8.0
    LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.8.0
    
    attached base packages:
    [1] stats     graphics  grDevices utils     datasets  methods   base     
    
    other attached packages:
    [1] plotly_4.9.0      rms_5.1-4         SparseM_1.77      Hmisc_4.2-1      
    [5] ggplot2_3.2.1     Formula_1.2-3     survival_2.44-1.1 lattice_0.20-38  

    R Core Team (2019). R: A Language and Environment for Statistical Computing. R Foundation for Statistical Computing, Vienna, Austria. <URL: https://www.R-project.org/>.